#Removing previous datasets in memory
rm(list = ls())
#Loading the relevant libraries
library(ggplot2)
library(gridExtra)
library(dplyr)
library(purrr)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)Lab 8: Statistics
T-Tests, Choropleth Maps, and Correlations
1 Intro
In this lab session, we delve into the practical application of T-tests, correlations, and mapping techniques.
From our lecture, we gained insights into Z-tests, which assess differences between two groups when the standard deviation or variance is known. Typically, a sample size larger than 30 is recommended to perform a z-test. However, in many instances, the population standard deviation is unknown, leading us to rely more frequently on t-tests. To elucidate the mechanics of t-tests, let’s revisit our sample of Latin American countries and compare them with global data.
2 Loading the Data
Let us go back to the old Life Expectancy dataset. If you don’t have it anymore, you can download Life Expectancy and urbanization from the following links:
Let us re-examine the distribution of our life expectancy dataset by making a histogram. We need to first load the data:
#Setting path
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/lab8/")
#Step1: Loading the data
life_expectancy_df <- read.csv(file = './life-expectancy.csv')
urbanization_df <- read.csv(file = './share-of-population-urban.csv')3 Cleaning the Data
In the next few lines we focus on the period starting with 1900, we average life expectancy over country (the original dataset is a panel - with countries and years), we eliminate entities which do not have country codes or have strange ones. By taking the average by country, we are getting rid of the time component.
#Step1: Selecting after 1900
life_expectancy_df<-subset(life_expectancy_df, Year>1900)
#Step2: Calculating the mean
life_expectancy_df2<-life_expectancy_df%>%
dplyr::group_by(Entity, Code)%>%
dplyr::summarize(life_exp_mean=mean(Life.expectancy.at.birth..historical.))
#Step3: Cleaning the Data
weird_labels <- c("OWID_KOS", "OWID_WRL", "")
clean_life_expectancy_df<-subset(life_expectancy_df2, !(Code %in% weird_labels))Let us look at our data. We have learned so far to use head.
head(clean_life_expectancy_df, n=5)We can also use potentially a more powerful function to look at our data. This is called glimpse.
glimpse(clean_life_expectancy_df)Rows: 235
Columns: 3
Groups: Entity [235]
$ Entity <chr> "Afghanistan", "Albania", "Algeria", "American Samoa", "…
$ Code <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "AIA", "ATG", …
$ life_exp_mean <dbl> 45.38333, 68.28611, 57.53013, 68.63750, 77.04861, 45.084…
As you can see, this tells us exactly how many observations (235), how many columns/variables (3) we have, the first values that these variables take, and the type of variables that we are dealing with: in this case, character and integers. We should consider using glimpse from now on to understand our data better.
4 Mapping the Data
We will now try to map the data. The package rnaturalearth provides the geographic data for all countries in the world. Use ne_countries to pull country data and choose the scale for how detailed your want your maps to be. Larger scales take more space and time to process, while smaller scales take less space and less time to process. (rnaturalearthhires is necessary for scale = “large”).
world <- ne_countries(scale = "medium", returnclass = "sf")We are now working with a different object. The data that we just cleaned - clean_life_expectancy_df, falls under three different categories of objects: data frame, grouped data frame, a tibble, and a dataframe.
class(clean_life_expectancy_df)[1] "grouped_df" "tbl_df" "tbl" "data.frame"
Tibble is the central data structure for the set of packages known as the tidyverse, which include dplyr, ggplot2, tidyr, and readr. The general ethos is that tibbles are lazy and surly: they do less and complain more than base data.frames. This forces problems to be tackled earlier and more explicitly, typically leading to code that is more expressive and robust.
If we look at the class for our new object world, it is a different class.
class(world)[1] "sf" "data.frame"
glimpse(world)Rows: 241
Columns: 64
$ scalerank <int> 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 5, 3, 1, 1, 1, 1, 1, 1,…
$ featurecla <chr> "Admin-0 country", "Admin-0 country", "Admin-0 country", "A…
$ labelrank <dbl> 5, 3, 3, 6, 6, 6, 6, 4, 2, 6, 4, 4, 5, 6, 6, 2, 4, 5, 6, 2,…
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom", "…
$ sov_a3 <chr> "NL1", "AFG", "AGO", "GB1", "ALB", "FI1", "AND", "ARE", "AR…
$ adm0_dif <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,…
$ level <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ type <chr> "Country", "Sovereign country", "Sovereign country", "Depen…
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ geou_dif <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ geounit <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ gu_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ su_dif <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ subunit <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ su_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ brk_diff <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ name <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ name_long <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ brk_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ brk_name <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ brk_group <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ abbrev <chr> "Aruba", "Afg.", "Ang.", "Ang.", "Alb.", "Aland", "And.", "…
$ postal <chr> "AW", "AF", "AO", "AI", "AL", "AI", "AND", "AE", "AR", "ARM…
$ formal_en <chr> "Aruba", "Islamic State of Afghanistan", "People's Republic…
$ formal_fr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ note_adm0 <chr> "Neth.", NA, NA, "U.K.", NA, "Fin.", NA, NA, NA, NA, "U.S.A…
$ note_brk <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Multiple claim…
$ name_sort <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ name_alt <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ mapcolor7 <dbl> 4, 5, 3, 6, 1, 4, 1, 2, 3, 3, 4, 4, 1, 7, 2, 1, 3, 1, 2, 3,…
$ mapcolor8 <dbl> 2, 6, 2, 6, 4, 1, 4, 1, 1, 1, 5, 5, 2, 5, 2, 2, 1, 6, 2, 2,…
$ mapcolor9 <dbl> 2, 8, 6, 6, 1, 4, 1, 3, 3, 2, 1, 1, 2, 9, 5, 2, 3, 5, 5, 1,…
$ mapcolor13 <dbl> 9, 7, 1, 3, 6, 6, 8, 3, 13, 10, 1, NA, 7, 11, 5, 7, 4, 8, 8…
$ pop_est <dbl> 103065, 28400000, 12799293, 14436, 3639453, 27153, 83888, 4…
$ gdp_md_est <dbl> 2258.0, 22270.0, 110300.0, 108.9, 21810.0, 1563.0, 3660.0, …
$ pop_year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ lastcensus <dbl> 2010, 1979, 1970, NA, 2001, NA, 1989, 2010, 2010, 2001, 201…
$ gdp_year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ economy <chr> "6. Developing region", "7. Least developed region", "7. Le…
$ income_grp <chr> "2. High income: nonOECD", "5. Low income", "3. Upper middl…
$ wikipedia <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ fips_10 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ iso_a2 <chr> "AW", "AF", "AO", "AI", "AL", "AX", "AD", "AE", "AR", "AM",…
$ iso_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALA", "AND", "ARE", "AR…
$ iso_n3 <chr> "533", "004", "024", "660", "008", "248", "020", "784", "03…
$ un_a3 <chr> "533", "004", "024", "660", "008", "248", "020", "784", "03…
$ wb_a2 <chr> "AW", "AF", "AO", NA, "AL", NA, "AD", "AE", "AR", "AM", "AS…
$ wb_a3 <chr> "ABW", "AFG", "AGO", NA, "ALB", NA, "ADO", "ARE", "ARG", "A…
$ woe_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ adm0_a3_is <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALA", "AND", "ARE", "AR…
$ adm0_a3_us <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ adm0_a3_un <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ adm0_a3_wb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Europe…
$ region_un <chr> "Americas", "Asia", "Africa", "Americas", "Europe", "Europe…
$ subregion <chr> "Caribbean", "Southern Asia", "Middle Africa", "Caribbean",…
$ region_wb <chr> "Latin America & Caribbean", "South Asia", "Sub-Saharan Afr…
$ name_len <dbl> 5, 11, 6, 8, 7, 5, 7, 20, 9, 7, 14, 10, 23, 22, 17, 9, 7, 1…
$ long_len <dbl> 5, 11, 6, 8, 7, 13, 7, 20, 9, 7, 14, 10, 27, 35, 19, 9, 7, …
$ abbrev_len <dbl> 5, 4, 4, 4, 4, 5, 4, 6, 4, 4, 9, 4, 7, 10, 6, 4, 5, 4, 4, 5…
$ tiny <dbl> 4, NA, NA, NA, NA, 5, 5, NA, NA, NA, 3, NA, NA, 2, 4, NA, N…
$ homepart <dbl> NA, 1, 1, NA, 1, NA, 1, 1, 1, 1, NA, 1, NA, NA, 1, 1, 1, 1,…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGON (…
At its most basic, an sf object is a collection of simple features that includes attributes and geometries in the form of a data frame. In other words, it is a data frame (or tibble) with rows of features, columns of attributes, and a special geometry column that contains the spatial aspects of the features.
We can use this format to make maps.
ggplot() +
geom_sf(data = world)But let us also look more closely at the attribute table of this spatial dataframe.
glimpse(world)Rows: 241
Columns: 64
$ scalerank <int> 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 5, 3, 1, 1, 1, 1, 1, 1,…
$ featurecla <chr> "Admin-0 country", "Admin-0 country", "Admin-0 country", "A…
$ labelrank <dbl> 5, 3, 3, 6, 6, 6, 6, 4, 2, 6, 4, 4, 5, 6, 6, 2, 4, 5, 6, 2,…
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom", "…
$ sov_a3 <chr> "NL1", "AFG", "AGO", "GB1", "ALB", "FI1", "AND", "ARE", "AR…
$ adm0_dif <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,…
$ level <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ type <chr> "Country", "Sovereign country", "Sovereign country", "Depen…
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ geou_dif <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ geounit <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ gu_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ su_dif <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ subunit <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ su_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ brk_diff <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ name <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ name_long <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ brk_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ brk_name <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ brk_group <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ abbrev <chr> "Aruba", "Afg.", "Ang.", "Ang.", "Alb.", "Aland", "And.", "…
$ postal <chr> "AW", "AF", "AO", "AI", "AL", "AI", "AND", "AE", "AR", "ARM…
$ formal_en <chr> "Aruba", "Islamic State of Afghanistan", "People's Republic…
$ formal_fr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ note_adm0 <chr> "Neth.", NA, NA, "U.K.", NA, "Fin.", NA, NA, NA, NA, "U.S.A…
$ note_brk <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Multiple claim…
$ name_sort <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ name_alt <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ mapcolor7 <dbl> 4, 5, 3, 6, 1, 4, 1, 2, 3, 3, 4, 4, 1, 7, 2, 1, 3, 1, 2, 3,…
$ mapcolor8 <dbl> 2, 6, 2, 6, 4, 1, 4, 1, 1, 1, 5, 5, 2, 5, 2, 2, 1, 6, 2, 2,…
$ mapcolor9 <dbl> 2, 8, 6, 6, 1, 4, 1, 3, 3, 2, 1, 1, 2, 9, 5, 2, 3, 5, 5, 1,…
$ mapcolor13 <dbl> 9, 7, 1, 3, 6, 6, 8, 3, 13, 10, 1, NA, 7, 11, 5, 7, 4, 8, 8…
$ pop_est <dbl> 103065, 28400000, 12799293, 14436, 3639453, 27153, 83888, 4…
$ gdp_md_est <dbl> 2258.0, 22270.0, 110300.0, 108.9, 21810.0, 1563.0, 3660.0, …
$ pop_year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ lastcensus <dbl> 2010, 1979, 1970, NA, 2001, NA, 1989, 2010, 2010, 2001, 201…
$ gdp_year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ economy <chr> "6. Developing region", "7. Least developed region", "7. Le…
$ income_grp <chr> "2. High income: nonOECD", "5. Low income", "3. Upper middl…
$ wikipedia <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ fips_10 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ iso_a2 <chr> "AW", "AF", "AO", "AI", "AL", "AX", "AD", "AE", "AR", "AM",…
$ iso_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALA", "AND", "ARE", "AR…
$ iso_n3 <chr> "533", "004", "024", "660", "008", "248", "020", "784", "03…
$ un_a3 <chr> "533", "004", "024", "660", "008", "248", "020", "784", "03…
$ wb_a2 <chr> "AW", "AF", "AO", NA, "AL", NA, "AD", "AE", "AR", "AM", "AS…
$ wb_a3 <chr> "ABW", "AFG", "AGO", NA, "ALB", NA, "ADO", "ARE", "ARG", "A…
$ woe_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ adm0_a3_is <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALA", "AND", "ARE", "AR…
$ adm0_a3_us <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ adm0_a3_un <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ adm0_a3_wb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Europe…
$ region_un <chr> "Americas", "Asia", "Africa", "Americas", "Europe", "Europe…
$ subregion <chr> "Caribbean", "Southern Asia", "Middle Africa", "Caribbean",…
$ region_wb <chr> "Latin America & Caribbean", "South Asia", "Sub-Saharan Afr…
$ name_len <dbl> 5, 11, 6, 8, 7, 5, 7, 20, 9, 7, 14, 10, 23, 22, 17, 9, 7, 1…
$ long_len <dbl> 5, 11, 6, 8, 7, 13, 7, 20, 9, 7, 14, 10, 27, 35, 19, 9, 7, …
$ abbrev_len <dbl> 5, 4, 4, 4, 4, 5, 4, 6, 4, 4, 9, 4, 7, 10, 6, 4, 5, 4, 4, 5…
$ tiny <dbl> 4, NA, NA, NA, NA, 5, 5, NA, NA, NA, 3, NA, NA, 2, 4, NA, N…
$ homepart <dbl> NA, 1, 1, NA, 1, NA, 1, 1, 1, 1, NA, 1, NA, NA, 1, 1, 1, 1,…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGON (…
Our data has a lot of variables, which we don’t need. It seems that the variable adm0_a3 might be a good variable to keep: we will use this variable to merge our clean data frame. It might also be helpful to keep admin, continent, sovereignt. These will help us identify some smaller geographic units (islands) which fall under larger jurisdictions.
world2<-subset(world, select = c(admin, adm0_a3, sovereignt, continent))
glimpse(world2)Rows: 241
Columns: 5
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom", "…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Europe…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGON (…
We now have a spatial dataframe with fewer variables. As mentioned, it seems that a good variable to merge on is adm0_a3. Let us look at both:
ctries_sp<-world2$adm0_a3
ctries_sp [1] "ABW" "AFG" "AGO" "AIA" "ALB" "ALD" "AND" "ARE" "ARG" "ARM" "ASM" "ATA"
[13] "ATC" "ATF" "ATG" "AUS" "AUT" "AZE" "BDI" "BEL" "BEN" "BFA" "BGD" "BGR"
[25] "BHR" "BHS" "BIH" "BLM" "BLR" "BLZ" "BMU" "BOL" "BRA" "BRB" "BRN" "BTN"
[37] "BWA" "CAF" "CAN" "CHE" "CHL" "CHN" "CIV" "CMR" "COD" "COG" "COK" "COL"
[49] "COM" "CPV" "CRI" "CUB" "CUW" "CYM" "CYN" "CYP" "CZE" "DEU" "DJI" "DMA"
[61] "DNK" "DOM" "DZA" "ECU" "EGY" "ERI" "ESP" "EST" "ETH" "FIN" "FJI" "FLK"
[73] "FRA" "FRO" "FSM" "GAB" "GBR" "GEO" "GGY" "GHA" "GIN" "GMB" "GNB" "GNQ"
[85] "GRC" "GRD" "GRL" "GTM" "GUM" "GUY" "HKG" "HMD" "HND" "HRV" "HTI" "HUN"
[97] "IDN" "IMN" "IND" "IOA" "IOT" "IRL" "IRN" "IRQ" "ISL" "ISR" "ITA" "JAM"
[109] "JEY" "JOR" "JPN" "KAS" "KAZ" "KEN" "KGZ" "KHM" "KIR" "KNA" "KOR" "KOS"
[121] "KWT" "LAO" "LBN" "LBR" "LBY" "LCA" "LIE" "LKA" "LSO" "LTU" "LUX" "LVA"
[133] "MAC" "MAF" "MAR" "MCO" "MDA" "MDG" "MDV" "MEX" "MHL" "MKD" "MLI" "MLT"
[145] "MMR" "MNE" "MNG" "MNP" "MOZ" "MRT" "MSR" "MUS" "MWI" "MYS" "NAM" "NCL"
[157] "NER" "NFK" "NGA" "NIC" "NIU" "NLD" "NOR" "NPL" "NRU" "NZL" "OMN" "PAK"
[169] "PAN" "PCN" "PER" "PHL" "PLW" "PNG" "POL" "PRI" "PRK" "PRT" "PRY" "PSX"
[181] "PYF" "QAT" "ROU" "RUS" "RWA" "SAH" "SAU" "SDN" "SDS" "SEN" "SGP" "SGS"
[193] "SHN" "SLB" "SLE" "SLV" "SMR" "SOL" "SOM" "SPM" "SRB" "STP" "SUR" "SVK"
[205] "SVN" "SWE" "SWZ" "SXM" "SYC" "SYR" "TCA" "TCD" "TGO" "THA" "TJK" "TKM"
[217] "TLS" "TON" "TTO" "TUN" "TUR" "TWN" "TZA" "UGA" "UKR" "URY" "USA" "UZB"
[229] "VAT" "VCT" "VEN" "VGB" "VIR" "VNM" "VUT" "WLF" "WSM" "YEM" "ZAF" "ZMB"
[241] "ZWE"
ctries_sp<-world2$adm0_a3
glimpse(ctries_sp) chr [1:241] "ABW" "AFG" "AGO" "AIA" "ALB" "ALD" "AND" "ARE" "ARG" "ARM" ...
ctries_df<-clean_life_expectancy_df$Code
glimpse(ctries_df) chr [1:235] "AFG" "ALB" "DZA" "ASM" "AND" "AGO" "AIA" "ATG" "ARG" "ARM" ...
Let us now examine whether there are differences when it comes to what is included/excluded in the two lists. We can do this with the help of the setdiff function, which indicates which elements of a vector or data frame X are not existent in a vector or data frame Y.
#which elements in ctries_sp are not existent ctries_df?
countries_dif<-setdiff(ctries_sp, ctries_df)
#printing them
countries_dif [1] "ALD" "ATA" "ATC" "ATF" "BLM" "CYN" "HMD" "IOA" "IOT" "KAS" "KOS" "NFK"
[13] "PCN" "PSX" "SAH" "SDS" "SGS" "SOL"
These are countries that are in the spatial dataframe, but not in the our dataframe. What are these countries?
#Within the dataframe world2, we select the adm0_a3 and check
#if it matches elements in countries_dif
world3<-subset(world2, adm0_a3 %in% countries_dif)
#We investigate the first 10 entries
head(world3, n = 10)#We make a list for better viewing of those
list(world3$admin)[[1]] [1] "Aland"
[2] "Antarctica"
[3] "Ashmore and Cartier Islands"
[4] "French Southern and Antarctic Lands"
[5] "Saint Barthelemy"
[6] "Northern Cyprus"
[7] "Heard Island and McDonald Islands"
[8] "Indian Ocean Territories"
[9] "British Indian Ocean Territory"
[10] "Siachen Glacier"
[11] "Kosovo"
[12] "Norfolk Island"
[13] "Pitcairn Islands"
[14] "Palestine"
[15] "Western Sahara"
[16] "South Sudan"
[17] "South Georgia and South Sandwich Islands"
[18] "Somaliland"
Let us now perform a left merge so that we can have the data that we cleaned onto the spatial dataframe.
#Performing a left join
merged<-left_join(world2, clean_life_expectancy_df, by = c("adm0_a3" = "Code"))
#Investigating the first 4 entries.
glimpse(merged, n = 4)Rows: 241
Columns: 7
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", …
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom"…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Eur…
$ Entity <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ life_exp_mean <dbl> 70.25972, 45.38333, 45.08466, 69.44028, 68.28611, NA, 77…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGO…
This looks good. Let us now make our first map.
ggplot() +
geom_sf(data = merged, aes(fill = life_exp_mean))+
ggtitle("Avg. Life Expectancy 1901-2021")We have a just made a choropleth map, which indicates intensity of a variable. Let us now add a few embellishments to make our map nicer.
ggplot() +
geom_sf(data = merged, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
scale_y_continuous(breaks=seq(-90, 90, by = 10), limits = c(-90,90))+
ggtitle("Avg. Life Expectancy 1901-2021")A map like this has exactly the same structure of a regular ggplot graph.
5 Performing a t-test
Let us now go back to our original task, that of calculating a t-test. Because we have this new variable - continent, we no longer need to manually ascribe a continent to our countries. Let us quickly see our labels for the continents.
unique(merged$continent)[1] "North America" "Asia"
[3] "Africa" "Europe"
[5] "South America" "Oceania"
[7] "Antarctica" "Seven seas (open ocean)"
We can use the “South America” label. Note that this does not capture all the countries which we previously referred to as “Latin America.” For example, Mexico is not included here, as it is part of North America. This is to say that Latin America can be referred to as both aa geographic and a linguistic space.
sample_latam<-subset(merged, continent=="South America")Let us quickly map South America.
ggplot() +
geom_sf(data = sample_latam, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
ggtitle("Avg. Life Expectancy 1901-2021")We are now ready to perform a t-test to see if the life expectancy in Latin America is different from life expectancy around the world. We should perform one more step: excluding Latin America from our world sample so that we are not comparing the Latin American sample to the world sample (which includes Latin America).
#Choose everything that is not South America
world_nolatam<-subset(merged, continent!="South America")
#Examining the result
glimpse(world_nolatam)Rows: 228
Columns: 7
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", …
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom"…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Eur…
$ Entity <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ life_exp_mean <dbl> 70.25972, 45.38333, 45.08466, 69.44028, 68.28611, NA, 77…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGO…
Note that our sample got reduced from 241 (size of world sample including Latin America) to 228 (size of world sample excluding Latin America). Let us now perform a t-test.
test_greater<-t.test(sample_latam$life_exp_mean,
mu=mean(world_nolatam$life_exp_mean, na.rm=T),
sigma.x=sd(sample_latam$life_exp_mean),
alternative = c("greater"))
test_greater
One Sample t-test
data: sample_latam$life_exp_mean
t = 0.77919, df = 12, p-value = 0.2255
alternative hypothesis: true mean is greater than 62.35303
95 percent confidence interval:
61.1916 Inf
sample estimates:
mean of x
63.25522
The result indicates that we do not reject the H0 because p is larger than 0.05, so the mean life expectancy for Latin America (as defined here), is not greater than that of the world. In other words, there isn’t sufficient evidence to conclude that the true mean life expectancy is greater than 62.3530333
But is it less?
test_less<-t.test(sample_latam$life_exp_mean,
mu=mean(world_nolatam$life_exp_mean, na.rm=T),
sigma.x=sd(sample_latam$life_exp_mean),
alternative = c("less"))
test_less
One Sample t-test
data: sample_latam$life_exp_mean
t = 0.77919, df = 12, p-value = 0.7745
alternative hypothesis: true mean is less than 62.35303
95 percent confidence interval:
-Inf 65.31885
sample estimates:
mean of x
63.25522
No, it does not seem to be the case. There isn’t sufficient evidence to conclude that the true mean life expectancy is less than 62.3530333.
Is it equal to that of the world?
test_twosided<-t.test(sample_latam$life_exp_mean,
mu=mean(world_nolatam$life_exp_mean, na.rm=T),
sigma.x=sd(sample_latam$life_exp_mean),
alternative = c("two.sided"))
test_twosided
One Sample t-test
data: sample_latam$life_exp_mean
t = 0.77919, df = 12, p-value = 0.451
alternative hypothesis: true mean is not equal to 62.35303
95 percent confidence interval:
60.73248 65.77796
sample estimates:
mean of x
63.25522
Yes, statistically it is similar. This test prevents us from rejecting the H0. Thus, there isn’t sufficient evidence to conclude that the true mean life expectancy differs significantly from the mean proposed, 62.3530333.
Another good way to visualize the similarity between the two groups is by using a boxplot.
#Creating a new variable - sample with NA values
merged$sample<-NA
#Replace all the NA values with "Rest of the World"
merged$sample<-"Rest of the World"
#Replace the values with South America, if continent=="South America"
merged$sample[merged$continent=="South America"]<-"South America"ggplot(merged, aes(x = sample, y = life_exp_mean, color = sample)) +
geom_boxplot() +
theme_bw()And this is the interpretation of a box plot.
We can also add more of the observations to the boplot by using geom_jitter, as shown below:
ggplot(merged, aes(x = sample, y = life_exp_mean, color = sample)) +
geom_boxplot() +
geom_jitter() +
theme_bw()6 Mapping Life Expectancy in Europe
What if we wanted to examine life expectancy in Europe over time? The first step is to fix the coordinates and add limits to our graph. This time around we need to choose a maximum Y (North Point), minimum Y (South Point), a maximum X (West Point), and minimum X (East Point). One good way would be to take these coordinates from the existing countries. Let us choose four countries that would allow us to decide on the coordinates for Europe: Norway, Portugal, Ukraine, and Greece.
#Step1: Selecting the country
norway<-subset(merged, Entity == "Norway")
#Step2: Selecting the geometry
nc_geom <- st_geometry(norway)
#Step3: Selecting the coordinate of all the geometry vertices
list_points_norway<-nc_geom[[1]]
#Step4: Turning them into a list of matrices
list_points_norway<-flatten(list_points_norway)
#Step5: Creating a dataframe
list_points_norway2<-as.data.frame(do.call(rbind, list_points_norway))
#Step6: Labeling lon_x and lat_y
names(list_points_norway2)<-c("lon_x", "lat_y")
#Step7: Examining the first 10 entries
head(list_points_norway2, n=10)For our map of Europe we are interested in the Northern most point in Norway. This will be our maximum Y.
max_lat_y<-max(list_points_norway2$lat_y)
max_lat_y[1] 80.47783
Let us now get another country that is in the south: Greece.
#Step1: Selecting the country
greece<-subset(merged, Entity == "Greece")
#Step2: Selecting the geometry
nc_geom <- st_geometry(greece)
#Step3: Selecting the coordinate of all the geometry vertices
list_points_greece<-nc_geom[[1]]
#Step4: Turning them into a list of matrices
list_points_greece<-flatten(list_points_greece)
#Step5: Creating a dataframe
list_points_greece2<-as.data.frame(do.call(rbind, list_points_greece))
#Step6: Labeling lon_x and lat_y
names(list_points_greece2)<-c("lon_x", "lat_y")
#Step7: Examining the first 10 entries
head(list_points_greece2, n=10)For our map of Europe we are interested in the Southern most point in Greece. This will be the minimum Y.
min_lat_y<-min(list_points_greece2$lat_y)
min_lat_y[1] 34.93447
Let us now get a country in the East: Ukraine.
#Step1: Selecting the country
ukraine<-subset(merged, Entity == "Ukraine")
#Step2: Selecting the geometry
nc_geom <- st_geometry(ukraine)
#Step3: Selecting the coordinate of all the geometry vertices
list_points_ukraine<-nc_geom[[1]]
#Step4: Turning them into a list of matrices
list_points_ukraine<-flatten(list_points_ukraine)
#Step5: Creating a dataframe
list_points_ukraine2<-as.data.frame(do.call(rbind, list_points_ukraine))
#Step6: Labeling lon_x and lat_y
names(list_points_ukraine2)<-c("lon_x", "lat_y")
#Step7: Examining the first 10 entries
head(list_points_ukraine2, n=10)For our map of Europe we are interested in the Eastern most point in Ukraine. This will be the maximum X.
max_lon_x<-max(list_points_ukraine2$lon_x)
max_lon_x[1] 40.12832
Let’s choose now a country in the West: Portugal
#Step1: Selecting the country
portugal<-subset(merged, Entity == "Portugal")
#Step2: Selecting the geometry
nc_geom <- st_geometry(portugal)
#Step3: Selecting the coordinate of all the geometry vertices
list_points_portugal<-nc_geom[[1]]
#Step4: Turning them into a list of matrices
list_points_portugal<-flatten(list_points_portugal)
#Step5: Creating a dataframe
list_points_portugal2<-as.data.frame(do.call(rbind, list_points_portugal))
#Step6: Labeling lon_x and lat_y
names(list_points_portugal2)<-c("lon_x", "lat_y")
#Step7: Examining the first 10 entries
head(list_points_portugal2, n=10)For our map of Europe we are interested in the Western most point in Portugal. This will be the minimum X.
min_lon_x<-min(list_points_portugal2$lon_x)
min_lon_x[1] -31.28296
We are now ready:
figure1<-ggplot() +
geom_sf() +
geom_sf(data = merged, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x), ylim = c(min_lat_y, max_lat_y))+
ggtitle("Avg. Life Expectancy 1901-2021")
ggsave(figure1, file = "./figure1.jpg",
height = 20, width = 20,
units = "cm", dpi = 300)
figure1Let us also visualize the points that we calculated. But to make things easier, we should create a dataframe.
euro_extreme<-data.frame(x_lon=c(min_lon_x, max_lon_x, min_lon_x, max_lon_x),
y_lat=c(min_lat_y, max_lat_y, max_lat_y, min_lat_y),
name = (c("South-West", "North-East", "North-West", "South-East")))
euro_extremeLet us visualize the four points that we calculated.
figure1<-ggplot() +
geom_sf() +
geom_sf(data = merged, aes(fill = life_exp_mean))+
geom_point(data= euro_extreme, aes(x=x_lon,
y=y_lat),
fill="blue", color="darkred", size=3)+
geom_label_repel(data= euro_extreme, aes(x=x_lon, y=y_lat,
label = name), fill = alpha(c("white"),0.8))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x-3, max_lon_x+3), ylim = c(min_lat_y-3, max_lat_y+3))+
ggtitle("Avg. Life Expectancy 1901-2021")
figure1This looks pretty good. Next step is to redo the merge in our data. Remember that we calculated average life expectancy for the entire period: historical years up to the present day. Thus, we need to do a new merge in which we merge the spatial dataframe to the specific year on life expectancy.
#Making a more friendly label for life expectancy
names(life_expectancy_df)[4]<-"life_exp_mean"
#Selecting only 1950
life_expectancy_1950<-subset(life_expectancy_df, Year==1950)
#Merging countries (life_expectancy_1950) to the shapefile (world2)
world_1950<-left_join(world2, life_expectancy_1950, by = c("adm0_a3"="Code"))
#Examining the result
glimpse(world_1950)Rows: 241
Columns: 8
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", …
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom"…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Eur…
$ Entity <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ Year <int> 1950, 1950, 1950, 1950, 1950, NA, 1950, 1950, 1950, 1950…
$ life_exp_mean <dbl> 57.2, 27.7, 36.3, 55.3, 44.7, NA, 64.6, 41.1, 61.2, 59.3…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGO…
What if we were interested in life expectancy in Europe in 1950?
Let us map life expectancy in 1950.
figure2<-ggplot() +
geom_sf(data = world_1950, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x), ylim = c(min_lat_y, max_lat_y))+
ggtitle("Life Expectancy in 1950")
figure2Let us map life expectancy in 2020.
#Selecting only 2020
life_expectancy_2020<-subset(life_expectancy_df, Year==2020)
#Merging countries (life_expectancy_2020) to the shapefile (world2)
world_2020<-left_join(world2, life_expectancy_2020, by = c("adm0_a3"="Code"))
#Examining the result
glimpse(world_2020)Rows: 241
Columns: 8
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", …
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom"…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Eur…
$ Entity <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania",…
$ Year <int> 2020, 2020, 2020, 2020, 2020, NA, 2020, 2020, 2020, 2020…
$ life_exp_mean <dbl> 75.7, 62.6, 62.3, 76.9, 77.0, NA, 79.0, 78.9, 75.9, 72.2…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGO…
figure3<-ggplot() +
geom_sf() +
geom_sf(data = world_2020, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x),
ylim = c(min_lat_y, max_lat_y))+
ggtitle("Life Expectancy in 2020")
figure3Let us put them side by side.
grid.arrange(figure2, figure3, ncol=2)We don’t see much of a difference because we also need to fix the color scheme in a way that has the same minmum and maximum. This is how we do this. We first identify the minimum and maximum value in the variable that we want to plot.
vmax <- max(life_expectancy_df$life_exp_mean, na.rm=T)
vmax[1] 86.5
vmin <- min(life_expectancy_df$life_exp_mean, na.rm=T)
vmin[1] 12
Let us now replot the maps
library(ggpubr)
figure2<-ggplot() +
geom_sf() +
geom_sf(data = world_1950, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x), ylim = c(min_lat_y, max_lat_y))+
scale_fill_gradient(limits = c(vmin, vmax), name = "Life Expectancy")+
ggtitle("Life Expectancy in 1950")
figure3<-ggplot() +
geom_sf() +
geom_sf(data = world_2020, aes(fill = life_exp_mean))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x),
ylim = c(min_lat_y, max_lat_y))+
scale_fill_gradient(limits = c(vmin, vmax), name = "Life Expectancy")+
ggtitle("Life Expectancy in 2020")
ggarrange(figure2, figure3, ncol=2, common.legend = TRUE)Now, the differences are more visible. What if we were interested in plotting the change in life expectancy between 1950 and 2020?
#Step1: Selecting only 1950 and 2002
life_expectancy_1950_2020<-subset(life_expectancy_df, Year %in% c(1950, 2020))
#Step2: Calculating the difference betweem the 2020 (life_expectancy_values) and 1950 (lag(life_expectancy_values)) by country
life_expectancy_1950_2020b<-life_expectancy_1950_2020%>%
dplyr::group_by(Code) %>%
dplyr::summarise(differ=life_exp_mean - lag(life_exp_mean))
#Step3: Removing countries with NA code and where is the difference calculated is NA
life_expectancy_1950_2020c<-subset(life_expectancy_1950_2020b, Code!="" & !is.na(differ))#Step4: Examining the result
glimpse(life_expectancy_1950_2020c)Rows: 237
Columns: 2
Groups: Code [237]
$ Code <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "AND", "ARE", "ARG", "ARM", …
$ differ <dbl> 18.5, 34.9, 26.0, 21.6, 32.3, 14.4, 37.8, 14.7, 12.9, 11.4, 21.…
Let us now plot the change.
#Performing a left join
world_change<-left_join(world2, life_expectancy_1950_2020c, by = c("adm0_a3"="Code"))
#Investigating the result
glimpse(world_change)Rows: 241
Columns: 6
$ admin <chr> "Aruba", "Afghanistan", "Angola", "Anguilla", "Albania", "A…
$ adm0_a3 <chr> "ABW", "AFG", "AGO", "AIA", "ALB", "ALD", "AND", "ARE", "AR…
$ sovereignt <chr> "Netherlands", "Afghanistan", "Angola", "United Kingdom", "…
$ continent <chr> "North America", "Asia", "Africa", "North America", "Europe…
$ differ <dbl> 18.5, 34.9, 26.0, 21.6, 32.3, NA, 14.4, 37.8, 14.7, 12.9, 1…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-69.89912 1..., MULTIPOLYGON (…
figure4<-ggplot() +
geom_sf() +
geom_sf(data = world_change, aes(fill = differ))+
theme_bw()+
xlab("X - Longitude") + ylab("Y - Latitude")+
coord_sf(xlim = c(min_lon_x, max_lon_x), ylim = c(min_lat_y, max_lat_y))+
ggtitle("Life Expectancy Differece: 2020-1950")
figure4What is the country in Europe with the highest change in life expectancy between 1950 and 2020?
#Step1: Selecting Europe
world_change_res<-subset(world_change, continent=="Europe")
#Step2: Selecting the maximum value for differ (difference between 2020 and 1950)
world_change_resx<-subset(world_change_res, differ==max(world_change_res$differ, na.rm=T))
#Step3: Identifying the country
world_change_resx$admin[1] "Albania"
What is the country in Europe with the lowest change in life expectancy between 1950 and 2020?
#Step2: Selecting the minimum value for differ (difference between 2020 and 1950)
world_change_resx<-subset(world_change_res, differ==min(world_change_res$differ, na.rm=T))
#Step3: Identifying the country
world_change_resx$admin[1] "Latvia"
7 Correlation between two Variables
Let us now calculate the correlation between urbanization and life expectancy.
#Providing more friendly names
names(urbanization_df)[4]<-"urb_mean"#Selecting only relevant variables
urbanization_df2<-subset(urbanization_df, select=c(Code, urb_mean, Year))#Performing a left join
merged_df<-left_join(life_expectancy_df, urbanization_df2, by = c("Code"="Code", "Year"="Year"))
#Removing NA values
merged_df2<-na.omit(merged_df)What is the correlation in life expectancy and urbanization for the entire dataset?
#Calculating the correlation
cor(merged_df2$life_exp_mean, merged_df2$urb_mean)[1] 0.4579985